library(car)
## Loading required package: carData
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(readxl)
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
#load the dataset
mydata <- read.csv('./data/day.csv')
#convert 'dteday' column to Date format
mydata$dteday <- as.Date(mydata$dteday)

#season
mydata$season <- cut(mydata$season,
                     breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
                     labels = c("Winter", "Spring", "Summer", "Fall"))
mydata$season <- factor(mydata$season, levels = c("Winter", "Spring", "Summer", "Fall"))

#workingday
mydata$workingday <- ifelse(mydata$workingday == 0, "Not_Workingday", "Workingday")
mydata$workingday <- factor(mydata$workingday, levels = c("Not_Workingday", "Workingday"))

#weather
mydata$weathersit <- cut(mydata$weathersit,
                         breaks = c(0.5, 1.5, 2.5, 3.5, 4.5),
                         labels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))
mydata$weathersit <- factor(mydata$weathersit, levels = c("Weather_1", "Weather_2", "Weather_3", "Weather_4"))

head(mydata)
##   instant     dteday season yr mnth holiday weekday     workingday weathersit
## 1       1 2011-01-01 Winter  0    1       0       6 Not_Workingday  Weather_2
## 2       2 2011-01-02 Winter  0    1       0       0 Not_Workingday  Weather_2
## 3       3 2011-01-03 Winter  0    1       0       1     Workingday  Weather_1
## 4       4 2011-01-04 Winter  0    1       0       2     Workingday  Weather_1
## 5       5 2011-01-05 Winter  0    1       0       3     Workingday  Weather_1
## 6       6 2011-01-06 Winter  0    1       0       4     Workingday  Weather_1
##       temp    atemp      hum windspeed casual registered  cnt
## 1 0.344167 0.363625 0.805833 0.1604460    331        654  985
## 2 0.363478 0.353739 0.696087 0.2485390    131        670  801
## 3 0.196364 0.189405 0.437273 0.2483090    120       1229 1349
## 4 0.200000 0.212122 0.590435 0.1602960    108       1454 1562
## 5 0.226957 0.229270 0.436957 0.1869000     82       1518 1600
## 6 0.204348 0.233209 0.518261 0.0895652     88       1518 1606

We are splitting the dataset in 2 parts, first part is year 2011 and the second part is year 2012. Then we are splitting subset of year 2011 to be 50% training dataset and 50% validation dataset. The year 2012 dataset will be used for prediction.

# Set a seed for reproducibility
# Setting a seed is crucial for reproducibility and consistency of results.
# It ensures that when you run the same code multiple times, you'll ...
# ... obtain the same results.
set.seed(20231103)
# Filter dataset for year 2011 (0) and year 2012 (1)
year2011 <- mydata[mydata$yr == 0, ]
year2012 <- mydata[mydata$yr == 1, ]

# Add 'Type' column and assign values
year2011$Type <- "Training"
year2012$Type <- "Validation"

# Combine training and validation data
mydata <- rbind(year2011, year2012)

We choose 3 questions of interest:

  1. How does working day affect the number of registered users? where Y is the numnber of registered users
  2. How does season affect the number of casual users? where Y is the sqrt(casual users)
  3. Is weather correlated to the number of bike rentals?* where Y is the count of bikes rented

Boxplots for proposed covariates y

#registered
ggplot(mydata, aes(x = Type, y = registered, color = Type)) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(x = "Type", y = "# Registered Users") +
  scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
  ggtitle("# Registered Users by Type (Training vs. Validation)") +
  theme_bw()

#sqrt(casual)
ggplot(mydata, aes(x = Type, y = sqrt(casual), color = Type)) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(x = "Type", y = "Square Root # Casual Users") +
  scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
  ggtitle("Square Root # Casual Users by Type (Training vs. Validation)") +
  theme_bw()

#cnt
ggplot(mydata, aes(x = Type, y = cnt, color = Type)) +
  geom_boxplot(position = position_dodge(width = 0.8)) +
  labs(x = "Type", y = "# Total Users") +
  scale_fill_manual(name = "Dataset Type", values = c("Training" = "blue", "Validation" = "red")) +
  ggtitle("# Total Users by Type (Training vs. Validation)") +
  theme_bw()

# Function to calculate statistics
calculate_statistics <- function(x) {
  return(c(
    Min = min(x, na.rm = TRUE),
    Q1 = quantile(x, 0.25, na.rm = TRUE),
    Median = median(x, na.rm = TRUE),
    Mean = mean(x, na.rm = TRUE),
    Q3 = quantile(x, 0.75, na.rm = TRUE),
    Max = max(x, na.rm = TRUE)
  ))
}
#creating training data our of mydata (training -> data from 2011)
training_data <- subset(mydata, Type == "Training")
summary(training_data)
##     instant        dteday              season         yr         mnth       
##  Min.   :  1   Min.   :2011-01-01   Winter:90   Min.   :0   Min.   : 1.000  
##  1st Qu.: 92   1st Qu.:2011-04-02   Spring:92   1st Qu.:0   1st Qu.: 4.000  
##  Median :183   Median :2011-07-02   Summer:94   Median :0   Median : 7.000  
##  Mean   :183   Mean   :2011-07-02   Fall  :89   Mean   :0   Mean   : 6.526  
##  3rd Qu.:274   3rd Qu.:2011-10-01               3rd Qu.:0   3rd Qu.:10.000  
##  Max.   :365   Max.   :2011-12-31               Max.   :0   Max.   :12.000  
##     holiday          weekday               workingday      weathersit 
##  Min.   :0.0000   Min.   :0.000   Not_Workingday:115   Weather_1:226  
##  1st Qu.:0.0000   1st Qu.:1.000   Workingday    :250   Weather_2:124  
##  Median :0.0000   Median :3.000                        Weather_3: 15  
##  Mean   :0.0274   Mean   :3.008                        Weather_4:  0  
##  3rd Qu.:0.0000   3rd Qu.:5.000                                       
##  Max.   :1.0000   Max.   :6.000                                       
##       temp             atemp              hum           windspeed      
##  Min.   :0.05913   Min.   :0.07907   Min.   :0.0000   Min.   :0.02239  
##  1st Qu.:0.32500   1st Qu.:0.32195   1st Qu.:0.5383   1st Qu.:0.13558  
##  Median :0.47917   Median :0.47285   Median :0.6475   Median :0.18690  
##  Mean   :0.48666   Mean   :0.46684   Mean   :0.6437   Mean   :0.19140  
##  3rd Qu.:0.65667   3rd Qu.:0.61238   3rd Qu.:0.7421   3rd Qu.:0.23508  
##  Max.   :0.84917   Max.   :0.84090   Max.   :0.9725   Max.   :0.50746  
##      casual         registered        cnt           Type          
##  Min.   :   9.0   Min.   : 416   Min.   : 431   Length:365        
##  1st Qu.: 222.0   1st Qu.:1730   1st Qu.:2132   Class :character  
##  Median : 614.0   Median :2915   Median :3740   Mode  :character  
##  Mean   : 677.4   Mean   :2728   Mean   :3406                     
##  3rd Qu.: 871.0   3rd Qu.:3632   3rd Qu.:4586                     
##  Max.   :3065.0   Max.   :4614   Max.   :6043
#creating training data our of mydata (validatioin -> data from 2012)
validation_data <- subset(mydata, Type == "Validation")
summary(validation_data)
##     instant          dteday              season         yr         mnth       
##  Min.   :366.0   Min.   :2012-01-01   Winter:91   Min.   :1   Min.   : 1.000  
##  1st Qu.:457.2   1st Qu.:2012-04-01   Spring:92   1st Qu.:1   1st Qu.: 4.000  
##  Median :548.5   Median :2012-07-01   Summer:94   Median :1   Median : 7.000  
##  Mean   :548.5   Mean   :2012-07-01   Fall  :89   Mean   :1   Mean   : 6.514  
##  3rd Qu.:639.8   3rd Qu.:2012-09-30               3rd Qu.:1   3rd Qu.: 9.750  
##  Max.   :731.0   Max.   :2012-12-31               Max.   :1   Max.   :12.000  
##     holiday           weekday               workingday      weathersit 
##  Min.   :0.00000   Min.   :0.000   Not_Workingday:116   Weather_1:237  
##  1st Qu.:0.00000   1st Qu.:1.000   Workingday    :250   Weather_2:123  
##  Median :0.00000   Median :3.000                        Weather_3:  6  
##  Mean   :0.03005   Mean   :2.986                        Weather_4:  0  
##  3rd Qu.:0.00000   3rd Qu.:5.000                                       
##  Max.   :1.00000   Max.   :6.000                                       
##       temp            atemp             hum           windspeed      
##  Min.   :0.1075   Min.   :0.1017   Min.   :0.2542   Min.   :0.04665  
##  1st Qu.:0.3477   1st Qu.:0.3507   1st Qu.:0.5081   1st Qu.:0.13372  
##  Median :0.5142   Median :0.4978   Median :0.6119   Median :0.17475  
##  Mean   :0.5041   Mean   :0.4819   Mean   :0.6122   Mean   :0.18957  
##  3rd Qu.:0.6540   3rd Qu.:0.6076   3rd Qu.:0.7111   3rd Qu.:0.23120  
##  Max.   :0.8617   Max.   :0.8049   Max.   :0.9250   Max.   :0.44156  
##      casual         registered        cnt           Type          
##  Min.   :   2.0   Min.   :  20   Min.   :  22   Length:366        
##  1st Qu.: 429.8   1st Qu.:3730   1st Qu.:4369   Class :character  
##  Median : 904.5   Median :4776   Median :5927   Mode  :character  
##  Mean   :1018.5   Mean   :4581   Mean   :5600                     
##  3rd Qu.:1262.0   3rd Qu.:5663   3rd Qu.:7011                     
##  Max.   :3410.0   Max.   :6946   Max.   :8714

Full models for each question

We are fitting full model which will be used to stepwise regression later. We address multicollinearity issue and delete variables that might cause it.

Note: I am not sure which of the variables should be deleted and which should be kept in the model to make it better.

full_model1 <- lm(registered ~ as.factor(workingday) + as.factor(season)+ holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model2 <- lm( sqrt(casual)~ as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

full_model3 <- lm( cnt ~ casual +as.factor(workingday) + as.factor(season) + holiday + weekday + temp + hum + windspeed + as.factor(weathersit),
                 data = training_data)

summary(full_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     holiday + weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1831.18  -249.18    45.54   306.01  1029.65 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       939.50     182.22   5.156 4.21e-07 ***
## as.factor(workingday)Workingday   698.55      55.36  12.617  < 2e-16 ***
## as.factor(season)Spring           812.16      92.94   8.738  < 2e-16 ***
## as.factor(season)Summer           794.67     121.99   6.514 2.52e-10 ***
## as.factor(season)Fall            1306.52      82.32  15.872  < 2e-16 ***
## holiday                          -196.65     156.92  -1.253 0.210974    
## weekday                            17.57      12.46   1.410 0.159397    
## temp                             2757.41     236.16  11.676  < 2e-16 ***
## hum                              -610.45     230.43  -2.649 0.008431 ** 
## windspeed                       -1469.23     351.94  -4.175 3.76e-05 ***
## as.factor(weathersit)Weather_2   -224.29      66.00  -3.398 0.000756 ***
## as.factor(weathersit)Weather_3  -1377.56     146.15  -9.426  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 471.3 on 353 degrees of freedom
## Multiple R-squared:  0.8083, Adjusted R-squared:  0.8024 
## F-statistic: 135.3 on 11 and 353 DF,  p-value: < 2.2e-16
summary(full_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     holiday + weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.0473  -3.0247  -0.0296   2.7154  18.3344 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      19.89493    1.84453  10.786  < 2e-16 ***
## as.factor(workingday)Workingday -11.35502    0.56044 -20.261  < 2e-16 ***
## as.factor(season)Spring           6.26683    0.94083   6.661 1.05e-10 ***
## as.factor(season)Summer           3.36323    1.23484   2.724  0.00678 ** 
## as.factor(season)Fall             4.43365    0.83329   5.321 1.84e-07 ***
## holiday                          -2.56032    1.58848  -1.612  0.10790    
## weekday                           0.09252    0.12616   0.733  0.46381    
## temp                             32.15458    2.39057  13.451  < 2e-16 ***
## hum                              -6.06923    2.33258  -2.602  0.00966 ** 
## windspeed                       -14.22007    3.56256  -3.992 7.99e-05 ***
## as.factor(weathersit)Weather_2   -2.09773    0.66811  -3.140  0.00183 ** 
## as.factor(weathersit)Weather_3   -8.83239    1.47944  -5.970 5.78e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.771 on 353 degrees of freedom
## Multiple R-squared:  0.8019, Adjusted R-squared:  0.7957 
## F-statistic: 129.9 on 11 and 353 DF,  p-value: < 2.2e-16
summary(full_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     holiday + weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2086.82  -250.88    43.02   316.95   994.65 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      6.822e+02  1.847e+02   3.694 0.000255 ***
## casual                           1.393e+00  8.172e-02  17.048  < 2e-16 ***
## as.factor(workingday)Workingday  9.622e+02  7.673e+01  12.540  < 2e-16 ***
## as.factor(season)Spring          7.127e+02  9.250e+01   7.705 1.35e-13 ***
## as.factor(season)Summer          7.396e+02  1.189e+02   6.221 1.40e-09 ***
## as.factor(season)Fall            1.250e+03  8.071e+01  15.488  < 2e-16 ***
## holiday                         -1.457e+02  1.526e+02  -0.955 0.340318    
## weekday                          1.545e+01  1.210e+01   1.277 0.202431    
## temp                             2.167e+03  2.599e+02   8.338 1.72e-15 ***
## hum                             -4.957e+02  2.248e+02  -2.205 0.028098 *  
## windspeed                       -1.132e+03  3.485e+02  -3.248 0.001273 ** 
## as.factor(weathersit)Weather_2  -1.881e+02  6.446e+01  -2.918 0.003754 ** 
## as.factor(weathersit)Weather_3  -1.256e+03  1.440e+02  -8.721  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.2 on 352 degrees of freedom
## Multiple R-squared:  0.8937, Adjusted R-squared:  0.8901 
## F-statistic: 246.6 on 12 and 352 DF,  p-value: < 2.2e-16
#plot added variables
avPlots(full_model1)

avPlots(full_model2)

avPlots(full_model3)

Stepwise Regression

stepwise_model1 <- step(full_model1, direction = "both")
## Start:  AIC=4505.31
## registered ~ as.factor(workingday) + as.factor(season) + holiday + 
##     weekday + temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - holiday                1    348826  78757811 4504.9
## <none>                                78408985 4505.3
## - weekday                1    441649  78850634 4505.4
## - hum                    1   1558913  79967898 4510.5
## - windspeed              1   3871169  82280153 4520.9
## - as.factor(weathersit)  2  19753550  98162534 4583.3
## - temp                   1  30282108 108691093 4622.5
## - as.factor(workingday)  1  35361497 113770481 4639.2
## - as.factor(season)      3  60396050 138805035 4707.8
## 
## Step:  AIC=4504.93
## registered ~ as.factor(workingday) + as.factor(season) + weekday + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                78757811 4504.9
## - weekday                1    509480  79267291 4505.3
## + holiday                1    348826  78408985 4505.3
## - hum                    1   1478325  80236136 4509.7
## - windspeed              1   3852838  82610649 4520.4
## - as.factor(weathersit)  2  19849983  98607794 4583.0
## - temp                   1  30118568 108876379 4621.1
## - as.factor(workingday)  1  39599907 118357718 4651.6
## - as.factor(season)      3  60285076 139042887 4706.4
stepwise_model2 <- step(full_model2, direction = "both")
## Start:  AIC=1152.44
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday + 
##     weekday + temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq     RSS    AIC
## - weekday                1      12.2  8046.8 1151.0
## <none>                                8034.6 1152.4
## - holiday                1      59.1  8093.7 1153.1
## - hum                    1     154.1  8188.7 1157.4
## - windspeed              1     362.6  8397.2 1166.5
## - as.factor(weathersit)  2     829.0  8863.6 1184.3
## - as.factor(season)      3    1441.3  9475.9 1206.7
## - temp                   1    4117.9 12152.4 1301.5
## - as.factor(workingday)  1    9343.4 17378.0 1432.0
## 
## Step:  AIC=1151
## sqrt(casual) ~ as.factor(workingday) + as.factor(season) + holiday + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq     RSS    AIC
## <none>                                8046.8 1151.0
## - holiday                1      63.9  8110.7 1151.9
## + weekday                1      12.2  8034.6 1152.4
## - hum                    1     164.6  8211.5 1156.4
## - windspeed              1     359.7  8406.5 1165.0
## - as.factor(weathersit)  2     818.8  8865.6 1182.4
## - as.factor(season)      3    1445.3  9492.1 1205.3
## - temp                   1    4107.9 12154.7 1299.5
## - as.factor(workingday)  1    9348.6 17395.5 1430.4
stepwise_model3 <- step(full_model3, direction = "both")
## Start:  AIC=4484.06
## cnt ~ casual + as.factor(workingday) + as.factor(season) + holiday + 
##     weekday + temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - holiday                1    190554  73761582 4483.0
## - weekday                1    340857  73911885 4483.7
## <none>                                73571028 4484.1
## - hum                    1   1016235  74587264 4487.1
## - windspeed              1   2205158  75776187 4492.8
## - temp                   1  14531329  88102358 4547.9
## - as.factor(weathersit)  2  15979331  89550360 4551.8
## - as.factor(workingday)  1  32868919 106439948 4616.9
## - as.factor(season)      3  54301778 127872806 4679.8
## - casual                 1  60747394 134318422 4701.8
## 
## Step:  AIC=4483
## cnt ~ casual + as.factor(workingday) + as.factor(season) + weekday + 
##     temp + hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## - weekday                1    383340  74144922 4482.9
## <none>                                73761582 4483.0
## + holiday                1    190554  73571028 4484.1
## - hum                    1    963292  74724874 4485.7
## - windspeed              1   2178046  75939628 4491.6
## - temp                   1  14386687  88148269 4546.0
## - as.factor(weathersit)  2  15987090  89748672 4550.6
## - as.factor(workingday)  1  35790437 109552019 4625.4
## - as.factor(season)      3  54174909 127936491 4678.0
## - casual                 1  61516967 135278549 4702.4
## 
## Step:  AIC=4482.9
## cnt ~ casual + as.factor(workingday) + as.factor(season) + temp + 
##     hum + windspeed + as.factor(weathersit)
## 
##                         Df Sum of Sq       RSS    AIC
## <none>                                74144922 4482.9
## + weekday                1    383340  73761582 4483.0
## + holiday                1    233037  73911885 4483.7
## - hum                    1   1087039  75231961 4486.2
## - windspeed              1   2121892  76266814 4491.2
## - temp                   1  14190777  88335699 4544.8
## - as.factor(weathersit)  2  15737193  89882115 4549.2
## - as.factor(workingday)  1  36139539 110284461 4625.8
## - as.factor(season)      3  54380050 128524972 4677.7
## - casual                 1  62035815 136180737 4702.8
summary(stepwise_model1)
## 
## Call:
## lm(formula = registered ~ as.factor(workingday) + as.factor(season) + 
##     weekday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1834.15  -238.70    46.37   304.93  1029.11 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       911.40     180.98   5.036 7.59e-07 ***
## as.factor(workingday)Workingday   715.86      53.66  13.341  < 2e-16 ***
## as.factor(season)Spring           815.27      92.98   8.768  < 2e-16 ***
## as.factor(season)Summer           798.73     122.04   6.545 2.09e-10 ***
## as.factor(season)Fall            1305.98      82.38  15.853  < 2e-16 ***
## weekday                            18.82      12.43   1.513  0.13110    
## temp                             2748.78     236.25  11.635  < 2e-16 ***
## hum                              -593.43     230.21  -2.578  0.01035 *  
## windspeed                       -1465.70     352.21  -4.161 3.97e-05 ***
## as.factor(weathersit)Weather_2   -229.81      65.91  -3.487  0.00055 ***
## as.factor(weathersit)Weather_3  -1381.03     146.24  -9.444  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 471.7 on 354 degrees of freedom
## Multiple R-squared:  0.8075, Adjusted R-squared:  0.802 
## F-statistic: 148.5 on 10 and 354 DF,  p-value: < 2.2e-16
summary(stepwise_model2)
## 
## Call:
## lm(formula = sqrt(casual) ~ as.factor(workingday) + as.factor(season) + 
##     holiday + temp + hum + windspeed + as.factor(weathersit), 
##     data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.7811  -2.9288  -0.0188   2.7108  18.2147 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      20.2681     1.7718  11.439  < 2e-16 ***
## as.factor(workingday)Workingday -11.3579     0.5601 -20.280  < 2e-16 ***
## as.factor(season)Spring           6.2838     0.9399   6.685 8.99e-11 ***
## as.factor(season)Summer           3.3963     1.2332   2.754  0.00619 ** 
## as.factor(season)Fall             4.4543     0.8323   5.352 1.57e-07 ***
## holiday                          -2.6529     1.5824  -1.677  0.09452 .  
## temp                             32.1003     2.3879  13.443  < 2e-16 ***
## hum                              -6.2416     2.3192  -2.691  0.00746 ** 
## windspeed                       -14.1586     3.5593  -3.978 8.43e-05 ***
## as.factor(weathersit)Weather_2   -2.0546     0.6651  -3.089  0.00217 ** 
## as.factor(weathersit)Weather_3   -8.7637     1.4755  -5.939 6.84e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.768 on 354 degrees of freedom
## Multiple R-squared:  0.8016, Adjusted R-squared:  0.796 
## F-statistic:   143 on 10 and 354 DF,  p-value: < 2.2e-16
summary(stepwise_model3)
## 
## Call:
## lm(formula = cnt ~ casual + as.factor(workingday) + as.factor(season) + 
##     temp + hum + windspeed + as.factor(weathersit), data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2060.85  -257.74    44.82   332.76  1011.50 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      7.189e+02  1.775e+02   4.050 6.29e-05 ***
## casual                           1.403e+00  8.153e-02  17.210  < 2e-16 ***
## as.factor(workingday)Workingday  9.826e+02  7.480e+01  13.136  < 2e-16 ***
## as.factor(season)Spring          7.157e+02  9.258e+01   7.730 1.12e-13 ***
## as.factor(season)Summer          7.474e+02  1.189e+02   6.285 9.65e-10 ***
## as.factor(season)Fall            1.252e+03  8.076e+01  15.501  < 2e-16 ***
## temp                             2.135e+03  2.594e+02   8.231 3.60e-15 ***
## hum                             -5.094e+02  2.236e+02  -2.278  0.02331 *  
## windspeed                       -1.110e+03  3.486e+02  -3.183  0.00159 ** 
## as.factor(weathersit)Weather_2  -1.840e+02  6.418e+01  -2.867  0.00439 ** 
## as.factor(weathersit)Weather_3  -1.243e+03  1.438e+02  -8.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 457.7 on 354 degrees of freedom
## Multiple R-squared:  0.8928, Adjusted R-squared:  0.8898 
## F-statistic:   295 on 10 and 354 DF,  p-value: < 2.2e-16

model 1: R^2 = 0.8075

model 2: R^2 = 0.8016

model 3: R^2 = 0.8928

avPlots(stepwise_model1)

avPlots(stepwise_model2)

avPlots(stepwise_model3)

For vif, values close to 1 indicate no multicollinearity, values between 1-3 indicate mild collinearity which is not ideal but can be considered ok since it won’t severaly impact the interpretability of the coefficients, values above indicate severe multicollinearity which is considered unreliable.

vif(stepwise_model1)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.019319  1        1.009613
## as.factor(season)     3.639198  3        1.240226
## weekday               1.017995  1        1.008957
## temp                  3.282494  1        1.811766
## hum                   1.918458  1        1.385084
## windspeed             1.199915  1        1.095406
## as.factor(weathersit) 1.834088  2        1.163737
vif(stepwise_model2)
##                           GVIF Df GVIF^(1/(2*Df))
## as.factor(workingday) 1.086910  1        1.042550
## as.factor(season)     3.640064  3        1.240276
## holiday               1.071406  1        1.035087
## temp                  3.282132  1        1.811666
## hum                   1.905606  1        1.380437
## windspeed             1.199328  1        1.095138
## as.factor(weathersit) 1.826326  2        1.162504
vif(stepwise_model3)
##                           GVIF Df GVIF^(1/(2*Df))
## casual                3.575021  1        1.890773
## as.factor(workingday) 2.104307  1        1.450623
## as.factor(season)     3.879645  3        1.253522
## temp                  4.203539  1        2.050253
## hum                   1.922350  1        1.386488
## windspeed             1.248688  1        1.117447
## as.factor(weathersit) 1.881977  2        1.171261

Prediction

validation_data$predicted_registered <- predict(stepwise_model1, newdata = validation_data)
validation_data$predicted_sqrtcasual <- predict(stepwise_model2, newdata = validation_data)
validation_data$predicted_count <- predict(stepwise_model3, newdata = validation_data)

Note: here in the predicitons I am unsure why the firts one is negative, in thesecond quetsion, I am unsure whether I should square the original value or not (not sure wthere it predicts the squared casaul count or not). In the third question, it produces Nans???

observed_values1 <- validation_data$registered
predicted_values1 <- validation_data$predicted_registered
# Calculate different prediction performance metrics
# Functions from the MLmetrics package
# Common regression metrics
# Calculate the Root Mean Squared Error (RMSE), which measures the ...
# ... average magnitude of prediction errors.
# Lower is better.
rmse1 <- RMSE(predicted_values1, observed_values1)
# same as
# sqrt(mean((observed_values - predicted_values)^2))

# Compute the Mean Absolute Error (MAE), indicating the average absolute ...
# ... difference between predicted and observed values.
# Lower is better.
mae1 <- MAE(predicted_values1, observed_values1)
# same as
# mean(abs(observed_values - predicted_values))

# Calculate the Mean Absolute Percentage Error (MAPE), measuring the ...
# ... average percentage difference between predicted and observed values.
mape1 <- MAPE(predicted_values1, observed_values1)
# same as
# mean(abs(predicted_values-observed_values)/observed_values)

# Determine the R-squared (R²) Score, representing the proportion of the ...
# ... variance in the observed values (of validation data set) ... 
# ... explained by the predicted values from the model.
# Higher is better.
r_squared1 <- R2_Score(predicted_values1, observed_values1)
# same as
# summary(lm(observed_values ~ predicted_values))$r.squared

# Display the calculated metrics
cat("Root Mean Squared Error (RMSE):", round(rmse1, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1929.403
cat("Mean Absolute Error (MAE):", round(mae1, digits = 4), "\n")
## Mean Absolute Error (MAE): 1799.901
cat("R-squared (R^2) Score:", round(r_squared1, digits = 4), "\n")
## R-squared (R^2) Score: -0.84
cat("Mean Absolute Percentage Error (MPE):", round(mape1, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6342
#second question
observed_values2 <- sqrt(validation_data$casual) #should I transform it here??
predicted_values2 <- validation_data$predicted_sqrtcasual
rmse2 <- RMSE(predicted_values2, observed_values2)
mae2 <- MAE(predicted_values2, observed_values2)
mape2 <- MAPE(predicted_values2, observed_values2)
r_squared2 <- R2_Score(predicted_values2, observed_values2)
cat("Root Mean Squared Error (RMSE):", round(rmse2, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 7.5939
cat("Mean Absolute Error (MAE):", round(mae2, digits = 4), "\n")
## Mean Absolute Error (MAE): 5.9561
cat("R-squared (R^2) Score:", round(r_squared2, digits = 4), "\n")
## R-squared (R^2) Score: 0.5865
cat("Mean Absolute Percentage Error (MPE):", round(mape2, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.2126
#third question
observed_values3 <- validation_data$count
predicted_values3 <- validation_data$predicted_count
rmse3 <- RMSE(predicted_values3, observed_values3)
mae3 <- MAE(predicted_values3, observed_values3)
mape3 <- MAPE(predicted_values3, observed_values3)
r_squared3 <- R2_Score(predicted_values3, observed_values3)
## Warning in mean.default(y_true): argument is not numeric or logical: returning
## NA
cat("Root Mean Squared Error (RMSE):", round(rmse3, digits = 4), "\n")
## Root Mean Squared Error (RMSE): NaN
cat("Mean Absolute Error (MAE):", round(mae3, digits = 4), "\n")
## Mean Absolute Error (MAE): NaN
cat("R-squared (R^2) Score:", round(r_squared3, digits = 4), "\n")
## R-squared (R^2) Score: NaN
cat("Mean Absolute Percentage Error (MPE):", round(mape3, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): NaN

not working for model 3??

Diagnostics of Training model

we choose model 1 because it has best RMSE and R^2 (it is done for whole 2011 year)

#done on model 1
m.mlr <- lm(registered ~ as.factor(workingday) + as.factor(season) + holiday + 
    temp + hum + windspeed + as.factor(weathersit),
                 data = year2011)
diagnostics_df <- data.frame(
  Residuals = resid(m.mlr),
  Fitted_Values = fitted(m.mlr),
  Standardized_Residuals = rstandard(m.mlr),
  Leverage = hatvalues(m.mlr)
  #Date = year2011$dteday
)

# Create the standardized residuals vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = Residuals)) +
  geom_point(col="blue", alpha=0.75) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "Residuals") +
  theme_bw()

# Create the QQ plot
ggplot(diagnostics_df, aes(sample = Standardized_Residuals)) +
  stat_qq(aes(sample = Standardized_Residuals), distribution = qnorm,
          size = 2, col="blue", alpha = 0.75) +
  geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
  labs(title = "Standardized Residual QQ Plot",
       x = "Theoretical Quantiles", y = "Sample Quantiles") +
  theme_bw()

# Create the sqrt(|standardized residuals|) vs. fitted values plot
ggplot(diagnostics_df, aes(x = Fitted_Values, y = sqrt(abs(Standardized_Residuals)))) +
  geom_point(col="blue", alpha=0.75) +
  labs( title = "Residuals vs. Fitted Values",
        x = "Fitted Values", y = "sqrt(|Standardized Residuals|)") +
  theme_bw()

# Leverage vs Standardized Residuals
ggplot(diagnostics_df, aes(x = Leverage, y = Standardized_Residuals)) +
  geom_point(alpha = 0.75) +
  labs(title = "Standardized Residuals vs. Leverage Plot",
       x = "Leverage", y = "Standardized Residuals") +
  theme_bw()

# Prediction

validation2012<- subset(validation_data)
validation2012$predicted_registered <- predict(m.mlr, newdata = validation2012)
summary(validation2012$predicted_registered)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -631.1  2260.8  3073.1  2829.8  3535.6  4281.9
# Extract observed and predicted values
observed_values <- validation2012$registered
predicted_values <- validation2012$predicted_registered
rmse <- RMSE(predicted_values, observed_values)
mae <- MAE(predicted_values, observed_values)
mape <- MAPE(predicted_values, observed_values)
r_squared <- R2_Score(predicted_values, observed_values)


cat("Root Mean Squared Error (RMSE):", round(rmse, digits = 4), "\n")
## Root Mean Squared Error (RMSE): 1930.15
cat("Mean Absolute Error (MAE):", round(mae, digits = 4), "\n")
## Mean Absolute Error (MAE): 1797.757
cat("R-squared (R^2) Score:", round(r_squared, digits = 4), "\n")
## R-squared (R^2) Score: -0.8414
cat("Mean Absolute Percentage Error (MPE):", round(mape, digits = 4), "\n")
## Mean Absolute Percentage Error (MPE): 0.6379
ggplot(validation2012, aes(x = registered, y = predicted_registered)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(x = "Observed Values", y = "Predicted Values",
       title = "Observed vs. Predicted Values") +
  theme_bw()

# Residuals plot
ggplot(validation2012, aes(x = 1:nrow(validation2012), y = registered-predicted_registered)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 0, color = "red", linetype = "dashed") +
  labs(x = "Observation Index", y = "Residuals",
       title = "Observed vs. Predicted Values") +
  theme_bw()